#packages
library(tidyverse)
## Warning: a(z) 'ggplot2' csomag az R 4.4.3 verziójával lett fordítva
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
## Warning: a(z) 'janitor' csomag az R 4.4.3 verziójával lett fordítva
##
## Kapcsolódás csomaghoz: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(lubridate)
library(readr)
library(psych)
##
## Kapcsolódás csomaghoz: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(ggplot2)
library(dplyr)
library(cowplot)
##
## Kapcsolódás csomaghoz: 'cowplot'
##
## The following object is masked from 'package:lubridate':
##
## stamp
library(lme4)
## A szükséges csomag betöltődik: Matrix
##
## Kapcsolódás csomaghoz: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(sjPlot)
## Learn more about sjPlot with 'browseVignettes("sjPlot")'.
##
## Kapcsolódás csomaghoz: 'sjPlot'
##
## The following objects are masked from 'package:cowplot':
##
## plot_grid, save_plot
library(performance)
#loading data
spotify_songs <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2020/2020-01-21/spotify_songs.csv')
## Rows: 32833 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): track_id, track_name, track_artist, track_album_id, track_album_na...
## dbl (13): track_popularity, danceability, energy, key, loudness, mode, speec...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#exploring data, checking for mistakes
glimpse(spotify_songs)
## Rows: 32,833
## Columns: 23
## $ track_id <chr> "6f807x0ima9a1j3VPbc7VN", "0r7CVbZTWZgbTCYdfa…
## $ track_name <chr> "I Don't Care (with Justin Bieber) - Loud Lux…
## $ track_artist <chr> "Ed Sheeran", "Maroon 5", "Zara Larsson", "Th…
## $ track_popularity <dbl> 66, 67, 70, 60, 69, 67, 62, 69, 68, 67, 58, 6…
## $ track_album_id <chr> "2oCs0DGTsRO98Gh5ZSl2Cx", "63rPSO264uRjW1X5E6…
## $ track_album_name <chr> "I Don't Care (with Justin Bieber) [Loud Luxu…
## $ track_album_release_date <chr> "2019-06-14", "2019-12-13", "2019-07-05", "20…
## $ playlist_name <chr> "Pop Remix", "Pop Remix", "Pop Remix", "Pop R…
## $ playlist_id <chr> "37i9dQZF1DXcZDD7cfEKhW", "37i9dQZF1DXcZDD7cf…
## $ playlist_genre <chr> "pop", "pop", "pop", "pop", "pop", "pop", "po…
## $ playlist_subgenre <chr> "dance pop", "dance pop", "dance pop", "dance…
## $ danceability <dbl> 0.748, 0.726, 0.675, 0.718, 0.650, 0.675, 0.4…
## $ energy <dbl> 0.916, 0.815, 0.931, 0.930, 0.833, 0.919, 0.8…
## $ key <dbl> 6, 11, 1, 7, 1, 8, 5, 4, 8, 2, 6, 8, 1, 5, 5,…
## $ loudness <dbl> -2.634, -4.969, -3.432, -3.778, -4.672, -5.38…
## $ mode <dbl> 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, …
## $ speechiness <dbl> 0.0583, 0.0373, 0.0742, 0.1020, 0.0359, 0.127…
## $ acousticness <dbl> 0.10200, 0.07240, 0.07940, 0.02870, 0.08030, …
## $ instrumentalness <dbl> 0.00e+00, 4.21e-03, 2.33e-05, 9.43e-06, 0.00e…
## $ liveness <dbl> 0.0653, 0.3570, 0.1100, 0.2040, 0.0833, 0.143…
## $ valence <dbl> 0.518, 0.693, 0.613, 0.277, 0.725, 0.585, 0.1…
## $ tempo <dbl> 122.036, 99.972, 124.008, 121.956, 123.976, 1…
## $ duration_ms <dbl> 194754, 162600, 176616, 169093, 189052, 16304…
colSums(is.na(spotify_songs))
## track_id track_name track_artist
## 0 5 5
## track_popularity track_album_id track_album_name
## 0 0 5
## track_album_release_date playlist_name playlist_id
## 0 0 0
## playlist_genre playlist_subgenre danceability
## 0 0 0
## energy key loudness
## 0 0 0
## mode speechiness acousticness
## 0 0 0
## instrumentalness liveness valence
## 0 0 0
## tempo duration_ms
## 0 0
#There are five tracks where the song title and artist are missing, and for these the popularity – our main variable – is 0, which is suspicious. I will remove these from the dataset.
data <- spotify_songs %>%
drop_na()
#distribution checks for numeric variables
hist(data$track_popularity)
hist(data$danceability)
hist(data$energy)
hist(data$loudness)
hist(data$speechiness)
hist(data$acousticness)
hist(data$instrumentalness)
hist(data$liveness)
hist(data$valence)
hist(data$tempo)
hist(data$duration_ms)
# distribution of categorical variables
pie(table(data$playlist_genre))
pie(table(data$playlist_subgenre))
pie(table(data$mode))
pie(table(data$key))
#descriptive statistics
data %>%
select(track_popularity, danceability, energy, key, loudness, speechiness, acousticness, instrumentalness, liveness, valence, tempo, duration_ms, playlist_genre, playlist_subgenre, mode) %>%
describe()
## vars n mean sd median trimmed mad
## track_popularity 1 32828 42.48 24.98 45.00 43.01 26.69
## danceability 2 32828 0.65 0.15 0.67 0.66 0.15
## energy 3 32828 0.70 0.18 0.72 0.71 0.19
## key 4 32828 5.37 3.61 6.00 5.35 4.45
## loudness 5 32828 -6.72 2.99 -6.17 -6.41 2.52
## speechiness 6 32828 0.11 0.10 0.06 0.09 0.04
## acousticness 7 32828 0.18 0.22 0.08 0.13 0.11
## instrumentalness 8 32828 0.08 0.22 0.00 0.02 0.00
## liveness 9 32828 0.19 0.15 0.13 0.16 0.07
## valence 10 32828 0.51 0.23 0.51 0.51 0.27
## tempo 11 32828 120.88 26.90 121.98 119.13 26.75
## duration_ms 12 32828 225796.83 59836.49 216000.00 220378.37 47246.01
## playlist_genre* 13 32828 3.44 1.71 3.00 3.43 2.97
## playlist_subgenre* 14 32828 12.60 6.79 12.00 12.60 8.90
## mode 15 32828 0.57 0.50 1.00 0.58 0.00
## min max range skew kurtosis se
## track_popularity 0.00 100.00 100.00 -0.23 -0.93 0.14
## danceability 0.00 0.98 0.98 -0.50 0.01 0.00
## energy 0.00 1.00 1.00 -0.64 0.00 0.00
## key 0.00 11.00 11.00 -0.02 -1.31 0.02
## loudness -46.45 1.27 47.72 -1.36 4.49 0.02
## speechiness 0.00 0.92 0.92 1.97 4.26 0.00
## acousticness 0.00 0.99 0.99 1.59 1.88 0.00
## instrumentalness 0.00 0.99 0.99 2.76 6.27 0.00
## liveness 0.00 1.00 1.00 2.08 5.07 0.00
## valence 0.00 0.99 0.99 -0.01 -0.90 0.00
## tempo 0.00 239.44 239.44 0.53 0.08 0.15
## duration_ms 4000.00 517810.00 513810.00 1.15 2.70 330.25
## playlist_genre* 1.00 6.00 5.00 0.01 -1.27 0.01
## playlist_subgenre* 1.00 24.00 23.00 0.02 -1.18 0.04
## mode 0.00 1.00 1.00 -0.27 -1.93 0.00
#correlation matrix for numeric variables
num_data <- data[sapply(data, is.numeric)]
cor(num_data, use = "pairwise.complete.obs")
## track_popularity danceability energy key
## track_popularity 1.0000000000 0.064753904 -0.108983765 -0.0004048699
## danceability 0.0647539039 1.000000000 -0.086073841 0.0117706083
## energy -0.1089837648 -0.086073841 1.000000000 0.0099724054
## key -0.0004048699 0.011770608 0.009972405 1.0000000000
## loudness 0.0577172461 0.025351421 0.676661769 0.0009196045
## mode 0.0105531999 -0.058711007 -0.004778384 -0.1739811674
## speechiness 0.0070670368 0.181808452 -0.032184394 0.0224621227
## acousticness 0.0850421483 -0.024514628 -0.539732195 0.0043780155
## instrumentalness -0.1500033849 -0.008657809 0.033282124 0.0060217005
## liveness -0.0545933642 -0.123899361 0.161316603 0.0028337090
## valence 0.0332775379 0.330538138 0.151050467 0.0199327942
## tempo -0.0055383266 -0.184132454 0.150072270 -0.0133158906
## duration_ms -0.1436342100 -0.096921719 0.012560486 0.0151412053
## loudness mode speechiness acousticness
## track_popularity 0.0577172461 0.010553200 0.007067037 0.085042148
## danceability 0.0253514208 -0.058711007 0.181808452 -0.024514628
## energy 0.6766617694 -0.004778384 -0.032184394 -0.539732195
## key 0.0009196045 -0.173981167 0.022462123 0.004378015
## loudness 1.0000000000 -0.019242161 0.010312605 -0.361646363
## mode -0.0192421607 1.000000000 -0.063446138 0.009399022
## speechiness 0.0103126054 -0.063446138 1.000000000 0.026167720
## acousticness -0.3616463627 0.009399022 0.026167720 1.000000000
## instrumentalness -0.1478231457 -0.006759530 -0.103385447 -0.006880668
## liveness 0.0775888723 -0.005485080 0.055337384 -0.077247436
## valence 0.0534109143 0.002566773 0.064755765 -0.016832604
## tempo 0.0937612756 0.014338514 0.044649018 -0.112781539
## duration_ms -0.1150394799 0.015575548 -0.089432207 -0.081552898
## instrumentalness liveness valence tempo
## track_popularity -0.150003385 -0.054593364 0.033277538 -0.005538327
## danceability -0.008657809 -0.123899361 0.330538138 -0.184132454
## energy 0.033282124 0.161316603 0.151050467 0.150072270
## key 0.006021700 0.002833709 0.019932794 -0.013315891
## loudness -0.147823146 0.077588872 0.053410914 0.093761276
## mode -0.006759530 -0.005485080 0.002566773 0.014338514
## speechiness -0.103385447 0.055337384 0.064755765 0.044649018
## acousticness -0.006880668 -0.077247436 -0.016832604 -0.112781539
## instrumentalness 1.000000000 -0.005505231 -0.175405779 0.023302804
## liveness -0.005505231 1.000000000 -0.020431976 0.020886604
## valence -0.175405779 -0.020431976 1.000000000 -0.025638719
## tempo 0.023302804 0.020886604 -0.025638719 1.000000000
## duration_ms 0.063256032 0.006196648 -0.032291758 -0.001347396
## duration_ms
## track_popularity -0.143634210
## danceability -0.096921719
## energy 0.012560486
## key 0.015141205
## loudness -0.115039480
## mode 0.015575548
## speechiness -0.089432207
## acousticness -0.081552898
## instrumentalness 0.063256032
## liveness 0.006196648
## valence -0.032291758
## tempo -0.001347396
## duration_ms 1.000000000
# Large overview figure: relationship between track popularity and numeric variables
base_theme <- theme_minimal(base_size = 12) +
theme(
plot.title = element_text(face = "bold", size = 11),
axis.title = element_text(face = "bold"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()
)
# Energy × track popularity
p_energy <- ggplot(data, aes(x = energy, y = track_popularity)) +
geom_point(color = "#4E79A7", alpha = 0.3, size = 1.3) +
geom_smooth(method = "lm", se = TRUE,
color = "black", fill = "grey80", linewidth = 0.9) +
labs(x = "Energy", y = "Track popularity", title = "Energy") +
base_theme
# Danceability × track popularity
p_dance <- ggplot(data, aes(x = danceability, y = track_popularity)) +
geom_point(color = "#59A14F", alpha = 0.3, size = 1.3) +
geom_smooth(method = "lm", se = TRUE,
color = "black", fill = "grey80", linewidth = 0.9) +
labs(x = "Danceability", y = "Track popularity", title = "Danceability") +
base_theme
# Loudness × track popularity
p_loudness <- ggplot(data, aes(x = loudness, y = track_popularity)) +
geom_point(color = "#9C755F", alpha = 0.3, size = 1.3) +
geom_smooth(method = "lm", se = TRUE,
color = "black", fill = "grey80", linewidth = 0.9) +
labs(x = "Loudness", y = "Track popularity", title = "Loudness") +
base_theme
# Speechiness × track popularity
p_speechiness <- ggplot(data, aes(x = speechiness, y = track_popularity)) +
geom_point(color = "#F28E2B", alpha = 0.3, size = 1.3) +
geom_smooth(method = "lm", se = TRUE,
color = "black", fill = "grey80", linewidth = 0.9) +
labs(x = "Speechiness", y = "Track popularity", title = "Speechiness") +
base_theme
# Acousticness × track popularity
p_acousticness <- ggplot(data, aes(x = acousticness, y = track_popularity)) +
geom_point(color = "#76B7B2", alpha = 0.3, size = 1.3) +
geom_smooth(method = "lm", se = TRUE,
color = "black", fill = "grey80", linewidth = 0.9) +
labs(x = "Acousticness", y = "Track popularity", title = "Acousticness") +
base_theme
# Instrumentalness × track popularity
p_instrumentalness <- ggplot(data, aes(x = instrumentalness, y = track_popularity)) +
geom_point(color = "#EDC948", alpha = 0.3, size = 1.3) +
geom_smooth(method = "lm", se = TRUE,
color = "black", fill = "grey80", linewidth = 0.9) +
labs(x = "Instrumentalness", y = "Track popularity", title = "Instrumentalness") +
base_theme
# Liveness × track popularity
p_liveness <- ggplot(data, aes(x = liveness, y = track_popularity)) +
geom_point(color = "#B07AA1", alpha = 0.3, size = 1.3) +
geom_smooth(method = "lm", se = TRUE,
color = "black", fill = "grey80", linewidth = 0.9) +
labs(x = "Liveness", y = "Track popularity", title = "Liveness") +
base_theme
# Valence × track popularity
p_valence <- ggplot(data, aes(x = valence, y = track_popularity)) +
geom_point(color = "#E15759", alpha = 0.3, size = 1.3) +
geom_smooth(method = "lm", se = TRUE,
color = "black", fill = "grey80", linewidth = 0.9) +
labs(x = "Valence", y = "Track popularity", title = "Valence") +
base_theme
# Tempo × track popularity
p_tempo <- ggplot(data, aes(x = tempo, y = track_popularity)) +
geom_point(color = "#FF9DA7", alpha = 0.25, size = 1.2) +
geom_smooth(method = "lm", se = TRUE,
color = "black", fill = "grey80", linewidth = 0.9) +
labs(x = "Tempo", y = "Track popularity", title = "Tempo") +
base_theme
# Duration × track popularity
p_duration_ms <- ggplot(data, aes(x = duration_ms, y = track_popularity)) +
geom_point(color = "#BAB0AC", alpha = 0.25, size = 1.2) +
geom_smooth(method = "lm", se = TRUE,
color = "black", fill = "grey80", linewidth = 0.9) +
labs(x = "Duration (ms)", y = "Track popularity", title = "Duration") +
base_theme
# Combined overview of the plots
plot_numvar <- cowplot::plot_grid(
p_energy, p_dance,
p_loudness, p_speechiness,
p_acousticness, p_instrumentalness,
p_liveness, p_valence,
p_tempo, p_duration_ms,
labels = c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J"),
ncol = 2
)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
plot_numvar
# Overview of how categorical variables relate to track popularity
# Playlist subgenre × track popularity (mean ± SD)
p_subgenre_bar <- data %>%
group_by(playlist_subgenre) %>%
summarise(
mean_pop = mean(track_popularity, na.rm = TRUE),
sd_pop = sd(track_popularity, na.rm = TRUE),
.groups = "drop"
) %>%
ggplot(aes(x = reorder(playlist_subgenre, mean_pop), y = mean_pop)) +
geom_col(fill = "grey85", color = "black") +
geom_errorbar(aes(ymin = mean_pop - sd_pop, ymax = mean_pop + sd_pop), width = 0.2) +
coord_flip() +
labs(
x = "Playlist subgenre",
y = "Mean track popularity (± SD)",
title = "Playlist subgenre"
) +
base_theme
p_subgenre_bar
# Playlist genre × track popularity
p_genre <- ggplot(data, aes(x = playlist_genre, y = track_popularity)) +
geom_boxplot(
aes(color = playlist_genre),
outlier.shape = NA
) +
geom_jitter(
aes(color = playlist_genre),
width = 0.2,
alpha = 0.05,
size = 0.9
) +
labs(
x = "Playlist genre",
y = "Track popularity",
title = "Playlist genre"
) +
base_theme +
theme(
axis.text.x = element_text(angle = 45, hjust = 1)
)
p_genre
# Mode (major vs. minor) × track popularity
p_mode <- ggplot(data, aes(x = factor(mode), y = track_popularity)) +
geom_boxplot(
aes(color = factor(mode)),
outlier.shape = NA
) +
geom_jitter(
aes(color = factor(mode)),
width = 0.15,
alpha = 0.05,
size = 0.9
) +
scale_x_discrete(labels = c("Minor", "Major")) +
scale_color_viridis_d(guide = "none") +
labs(
x = "Mode",
y = "Track popularity",
title = "Mode"
) +
base_theme
p_mode
#Analysis plan
#Imagine that we are aspiring musicians who also know a bit of statistics, but despite all our efforts, we just cannot manage to produce a true banger hit. Instead of guessing what might work, we decide to turn to data and let statistics tell us what a song should be like in order to become as popular as possible.
#Following this idea, in this project I build and compare linear mixed-effects models to identify which musical features are the best predictors of track popularity. Mixed-effects models are particularly suitable for this task, as multiple tracks in the dataset are produced by the same artist, violating the assumption of independent observations required by standard linear regression. By including artist as a random effect, the models account for this dependency while allowing us to focus on the general relationships between song characteristics and popularity.
# Predictors were selected to reflect musical features that can be actively adjusted by musicians (assuming that I cannot/do not want to change myself, so genre and instruments are taken as given). As energy, danceability, valence, tempo, speechiness and mode was used as predictors.
# In addition, I am curious whether it is necessary to consider multiple musical features, or whether it is sufficient that a track is easy to dance to. Therefore, I also build a simpler model to compare it with the more complex one.
#Assmuption checks for LMM ##building the model
#Mean-centering continuous predictors
data <- data %>%
mutate(
energy_c = scale(energy, scale = FALSE),
danceability_c = scale(danceability, scale = FALSE),
valence_c = scale(valence, scale = FALSE),
tempo_c = scale(tempo, scale = FALSE),
speechiness_c = scale(speechiness, scale = FALSE)
)
#Mode as categorical predictor
data$mode_f <- factor(data$mode, labels = c("Minor", "Major"))
#LMM with artist as random intercept
spotify_model <- lmer(
track_popularity ~
energy_c +
danceability_c +
valence_c +
tempo_c +
speechiness_c +
mode_f +
(1 | track_artist),
data = data
)
##checking linearity and homoskedaticity on a residuals vs. fitted plot
#linearity check with scatterplots
plot_numvar
#the variables seems to be ok in terms of liearity
#residuals vs. fitted plot
diag_df <- data.frame(
fitted = fitted(spotify_model),
resid = resid(spotify_model)
)
ggplot(diag_df, aes(x = fitted, y = resid)) +
geom_point(color = "grey40", alpha = 0.4, size = 1.3) +
geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
labs(
x = "Fitted values",
y = "Residuals",
title = "Linearity & homoskedasticity check: residuals vs fitted values"
) +
base_theme
#Homoscedasticity seems to be violated, with a slight funnel-shaped pattern in the residuals. Track popularity is skewed, with many values close to zero. To handle this, I apply a log transformation to the outcome variable.
# Log-transform track popularity to reduce skewness and heteroscedasticity
data <- data %>%
mutate(
track_popularity_log = log(track_popularity + 1)
)
# LMM with log-transformed outcome
spotify_modell_log <- lmer(
track_popularity_log ~
energy_c +
danceability_c +
valence_c +
tempo_c +
speechiness_c +
mode_f +
(1 | track_artist),
data = data
)
#checkyng residuals vs. fitted plot again
#residuals vs. fitted plot
diag_df <- data.frame(
fitted = fitted(spotify_modell_log),
resid = resid(spotify_modell_log)
)
ggplot(diag_df, aes(x = fitted, y = resid)) +
geom_point(color = "grey40", alpha = 0.4, size = 1.3) +
geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
labs(
x = "Fitted values",
y = "Residuals",
title = "Linearity & homoskedasticity check: residuals vs fitted values"
) +
base_theme
# The log transformation did not improve the residual pattern. I therefore return to the original model with untransformed track popularity. LMMs are fairly robust, so this violation is handled as a limitation of the analysis.
##checking the normality of the residuals
#Q–Q plot of residuals
diag_df <- data.frame(resid = resid(spotify_model))
ggplot(diag_df, aes(sample = resid)) +
stat_qq(color = "grey40", alpha = 0.6) +
stat_qq_line(color = "black") +
labs(
x = "Theoretical quantiles",
y = "Sample quantiles",
title = "Residual Q–Q plot"
) +
base_theme
#histogram of the residuals
ggplot(diag_df, aes(x = resid)) +
geom_histogram(bins = 30, fill = "grey85", color = "black") +
labs(
x = "Residuals",
y = "Count",
title = "Residual distribution"
) +
base_theme
#based on the plots, normality of the resiusals seems to be ok
##Model convergence and random-effect variance
#Model convergence and random-effect variance
summary(spotify_model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: track_popularity ~ energy_c + danceability_c + valence_c + tempo_c +
## speechiness_c + mode_f + (1 | track_artist)
## Data: data
##
## REML criterion at convergence: 296314
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.0723 -0.5193 0.1095 0.6230 3.2771
##
## Random effects:
## Groups Name Variance Std.Dev.
## track_artist (Intercept) 182.6 13.51
## Residual 389.9 19.75
## Number of obs: 32828, groups: track_artist, 10692
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 38.278893 0.241162 158.727
## energy_c -13.868005 0.803975 -17.249
## danceability_c 3.480819 1.065912 3.266
## valence_c 6.058650 0.636827 9.514
## tempo_c 0.006744 0.004769 1.414
## speechiness_c -0.667654 1.405035 -0.475
## mode_fMajor 0.813036 0.250380 3.247
##
## Correlation of Fixed Effects:
## (Intr) enrgy_ dncbl_ vlnc_c temp_c spchn_
## energy_c 0.027
## danceblty_c -0.020 0.095
## valence_c 0.001 -0.220 -0.337
## tempo_c 0.001 -0.105 0.182 -0.030
## speechnss_c -0.015 0.010 -0.058 -0.043 -0.100
## mode_fMajor -0.585 0.009 0.017 0.000 -0.012 0.035
isSingular(spotify_model, tol = 1e-4)
## [1] FALSE
# The extended model converged properly, is not singular, and shows meaningful random-effect variance, indicating that the model assumptions are sufficiently met.
#interpreation of the complex model
spotify_model
## Linear mixed model fit by REML ['lmerMod']
## Formula: track_popularity ~ energy_c + danceability_c + valence_c + tempo_c +
## speechiness_c + mode_f + (1 | track_artist)
## Data: data
## REML criterion at convergence: 296314
## Random effects:
## Groups Name Std.Dev.
## track_artist (Intercept) 13.51
## Residual 19.75
## Number of obs: 32828, groups: track_artist, 10692
## Fixed Effects:
## (Intercept) energy_c danceability_c valence_c tempo_c
## 38.278893 -13.868005 3.480819 6.058650 0.006744
## speechiness_c mode_fMajor
## -0.667654 0.813036
summary(spotify_model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: track_popularity ~ energy_c + danceability_c + valence_c + tempo_c +
## speechiness_c + mode_f + (1 | track_artist)
## Data: data
##
## REML criterion at convergence: 296314
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.0723 -0.5193 0.1095 0.6230 3.2771
##
## Random effects:
## Groups Name Variance Std.Dev.
## track_artist (Intercept) 182.6 13.51
## Residual 389.9 19.75
## Number of obs: 32828, groups: track_artist, 10692
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 38.278893 0.241162 158.727
## energy_c -13.868005 0.803975 -17.249
## danceability_c 3.480819 1.065912 3.266
## valence_c 6.058650 0.636827 9.514
## tempo_c 0.006744 0.004769 1.414
## speechiness_c -0.667654 1.405035 -0.475
## mode_fMajor 0.813036 0.250380 3.247
##
## Correlation of Fixed Effects:
## (Intr) enrgy_ dncbl_ vlnc_c temp_c spchn_
## energy_c 0.027
## danceblty_c -0.020 0.095
## valence_c 0.001 -0.220 -0.337
## tempo_c 0.001 -0.105 0.182 -0.030
## speechnss_c -0.015 0.010 -0.058 -0.043 -0.100
## mode_fMajor -0.585 0.009 0.017 0.000 -0.012 0.035
tab_model(spotify_model, show.std = TRUE)
| track popularity | |||||
|---|---|---|---|---|---|
| Predictors | Estimates | std. Beta | CI | standardized CI | p |
| (Intercept) | 38.28 | -0.17 | 37.81 – 38.75 | -0.19 – -0.15 | <0.001 |
| energy c | -13.87 | -0.10 | -15.44 – -12.29 | -0.11 – -0.09 | <0.001 |
| danceability c | 3.48 | 0.02 | 1.39 – 5.57 | 0.01 – 0.03 | 0.001 |
| valence c | 6.06 | 0.06 | 4.81 – 7.31 | 0.04 – 0.07 | <0.001 |
| tempo c | 0.01 | 0.01 | -0.00 – 0.02 | -0.00 – 0.02 | 0.157 |
| speechiness c | -0.67 | -0.00 | -3.42 – 2.09 | -0.01 – 0.01 | 0.635 |
| mode f [Major] | 0.81 | 0.03 | 0.32 – 1.30 | 0.01 – 0.05 | 0.001 |
| Random Effects | |||||
| σ2 | 389.88 | ||||
| τ00 track_artist | 182.61 | ||||
| ICC | 0.32 | ||||
| N track_artist | 10692 | ||||
| Observations | 32828 | ||||
| Marginal R2 / Conditional R2 | 0.014 / 0.329 | ||||
# Model summary:
# Artist-level differences are large (ICC ~ 0.32), so using an LMM with artist as a random effect is justified.
#Fixed effects (within-artist associations):
#- Energy: strong negative association with popularity (higher energy -> lower popularity).
#- Danceability: small positive association (more danceable -> slightly higher popularity).
#- Valence: clear positive association (more positive mood -> higher popularity).
#- Mode: major is slightly more popular than minor.
#- Tempo and speechiness: not significant once the other predictors are in the model.
# Overall: the audio features explain only a small part of popularity (marginal R2 ~ 0.01),
# while adding artist differences boosts explained variance a lot (conditional R2 ~ 0.33).
# Overall, as a musician, I would write a song that avoids extreme energy, focuses on a positive mood and good danceability, leans slightly toward a major key, and does not worry too much about tempo or spoken elements.
#simple model
#building the model
# Simple model: focusing only on danceability
spotify_model_simple <- lmer(
track_popularity ~
danceability_c +
(1 | track_artist),
data = data
)
# 1) Linearity and homoscedasticity: residuals vs fitted values
diag_simple <- data.frame(
fitted = fitted(spotify_model_simple),
resid = resid(spotify_model_simple)
)
ggplot(diag_simple, aes(x = fitted, y = resid)) +
geom_point(color = "grey40", alpha = 0.4, size = 1.3) +
geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
labs(
x = "Fitted values",
y = "Residuals",
title = "Simple model: residuals vs fitted values"
) +
base_theme
# Homoscedasticity, similarly to the previous model, is not perfect.
#Model convergence and random-effect variance
summary(spotify_model_simple)
## Linear mixed model fit by REML ['lmerMod']
## Formula: track_popularity ~ danceability_c + (1 | track_artist)
## Data: data
##
## REML criterion at convergence: 296650.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.0831 -0.5157 0.1104 0.6214 3.3255
##
## Random effects:
## Groups Name Variance Std.Dev.
## track_artist (Intercept) 187.2 13.68
## Residual 392.9 19.82
## Number of obs: 32828, groups: track_artist, 10692
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 38.844 0.197 197.158
## danceability_c 7.179 0.989 7.259
##
## Correlation of Fixed Effects:
## (Intr)
## danceblty_c -0.013
isSingular(spotify_model_simple, tol = 1e-4)
## [1] FALSE
#The simple model converged properly, is not singular, and shows meaningful random-effect variance, indicating that the model specification is appropriate from an assumptions perspective.
#Q–Q plot of residuals
diag_df <- data.frame(resid = resid(spotify_model_simple))
ggplot(diag_df, aes(sample = resid)) +
stat_qq(color = "grey40", alpha = 0.6) +
stat_qq_line(color = "black") +
labs(
x = "Theoretical quantiles",
y = "Sample quantiles",
title = "Residual Q–Q plot"
) +
base_theme
#histogram of the residuals
ggplot(diag_df, aes(x = resid)) +
geom_histogram(bins = 30, fill = "grey85", color = "black") +
labs(
x = "Residuals",
y = "Count",
title = "Residual distribution"
) +
base_theme
#residual normality seems acceptable
spotify_model_simple
## Linear mixed model fit by REML ['lmerMod']
## Formula: track_popularity ~ danceability_c + (1 | track_artist)
## Data: data
## REML criterion at convergence: 296650.3
## Random effects:
## Groups Name Std.Dev.
## track_artist (Intercept) 13.68
## Residual 19.82
## Number of obs: 32828, groups: track_artist, 10692
## Fixed Effects:
## (Intercept) danceability_c
## 38.844 7.179
summary(spotify_model_simple)
## Linear mixed model fit by REML ['lmerMod']
## Formula: track_popularity ~ danceability_c + (1 | track_artist)
## Data: data
##
## REML criterion at convergence: 296650.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.0831 -0.5157 0.1104 0.6214 3.3255
##
## Random effects:
## Groups Name Variance Std.Dev.
## track_artist (Intercept) 187.2 13.68
## Residual 392.9 19.82
## Number of obs: 32828, groups: track_artist, 10692
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 38.844 0.197 197.158
## danceability_c 7.179 0.989 7.259
##
## Correlation of Fixed Effects:
## (Intr)
## danceblty_c -0.013
tab_model(spotify_model_simple, show.std = TRUE)
| track popularity | |||||
|---|---|---|---|---|---|
| Predictors | Estimates | std. Beta | CI | standardized CI | p |
| (Intercept) | 38.84 | -0.15 | 38.46 – 39.23 | -0.16 – -0.13 | <0.001 |
| danceability c | 7.18 | 0.04 | 5.24 – 9.12 | 0.03 – 0.05 | <0.001 |
| Random Effects | |||||
| σ2 | 392.90 | ||||
| τ00 track_artist | 187.23 | ||||
| ICC | 0.32 | ||||
| N track_artist | 10692 | ||||
| Observations | 32828 | ||||
| Marginal R2 / Conditional R2 | 0.002 / 0.324 | ||||
#Simple model results: Danceability alone shows a clear positive association with track popularity: more danceable tracks tend to be more popular within the same artist. However, the effect size is small (marginal R2 ~ 0.002), while artist-level differences remain large (ICC ~ 0.32), indicating that danceability by itself explains only a very small part of popularity.
#comparing the two models
anova(spotify_model_simple, spotify_model)
## refitting model(s) with ML (instead of REML)
## Data: data
## Models:
## spotify_model_simple: track_popularity ~ danceability_c + (1 | track_artist)
## spotify_model: track_popularity ~ energy_c + danceability_c + valence_c + tempo_c + speechiness_c + mode_f + (1 | track_artist)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## spotify_model_simple 4 296659 296692 -148325 296651
## spotify_model 9 296327 296403 -148155 296309 341.33 5 < 2.2e-16
##
## spotify_model_simple
## spotify_model ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model comparison shows that the extended model fits the data significantly better than the simple, danceability-only model (AIC and BIC values improved, and the p-value indicates a significant improvement and the extended model explains more variance, with higher marginal R² and a slightly higher conditional R²)..This suggests that track popularity depends on multiple musical features jointly, and cannot be explained by danceability alone.